I first coded emoticons (like :), =), ((, etc.) as a separate feature, but rejected this idea, because:
I dropped all verbatims with no practical meaning (i.e. those containing only brand names, stopwords, punctuation and the word корм)
Start with loading the required libraries.
# Working with files, tables and vectors
import pandas as pd
import numpy as np
import os
import sys
import wget
import zipfile
from collections import Counter
import pickle
# Visualizations
import seaborn as sns
from plotly import tools
import plotly.offline as pyo
import plotly.graph_objs as go
pyo.init_notebook_mode(connected=True)
from wordcloud import WordCloud, ImageColorGenerator
from PIL import Image
import matplotlib.pyplot as plt
%matplotlib inline
# Working with texts
from gensim.models import Word2Vec
import gensim
import nltk
from nltk.corpus import stopwords
import string
from ufal.udpipe import Model, Pipeline
import re
from typing import List
# Models
from sklearn.cluster import KMeans
from sklearn.manifold import TSNE
from sklearn.mixture import GaussianMixture as GMM
from sklearn.decomposition import PCA
# Utility
from time import time
from tqdm._tqdm_notebook import tqdm_notebook
# Collect the SAM export files (one .xls per category) from the working directory.
files = [file for file in os.listdir() if file.endswith('.xls')]
print('There are %i SAM files in folder:\n'%len(files),'\n '.join(files))
# First file: cat-food verbatims; second: dog-food verbatims.
# NOTE(review): relies on os.listdir() returning cats first — confirm file naming.
cats = pd.read_excel(files[0], encoding='utf-16', header=None, skiprows=1)
cats.columns = ['brand','verbatim']
cats['category'] = 'CAT'
dogs = pd.read_excel(files[1], encoding='utf-16', header=None, skiprows=1)
dogs.columns = ['brand','verbatim']
dogs['category'] = 'DOG'
# Stack both categories into one frame and report the brand split as percentages.
df = pd.concat([cats,dogs], sort=False, ignore_index=True)
print('Combined data frame contains %i verbatims: %i for Cats and %i for Dogs.\n\nBrand split:\n'%
      (len(df),len(df[df.category=='CAT']),len(df[df.category=='DOG'])),
      round(df.brand.value_counts(dropna=False)/len(df)*100,1).astype(str)+"%")
Observation: the questionnaire was definitely quota-based, having the same number of respondents for each brand. We observe twice as many answers for RC, PO and PF because they are present in both the CAT and DOG categories.
In order to work with natural text, we need some kind of number representation of sentences.
This representation shall be unique for each text, but close by meaning verbatims shall have close representation for clustering.
In order to achieve this, we'll use Word2Vec technique.
But before we transform words into vector representations, we need to lemmatize words (return them into primary form), remove punctuation and remove stopwords, brand names. This is called text pre-processing
# Here we just load a prepared list of stopwords
nltk.download('stopwords')  # fetches the NLTK stopword corpus (network call, cached locally)
We'll use the RusVectores preprocessing algorithms for text preprocessing, with the addition of a couple of our own heuristics (removing stopwords and brand names)
First things first! Define preprocessing functions...
def num_replace(word):
    """Mask a numeric token with 'x' characters of the same length."""
    return 'x' * len(word)
def clean_token(token, misc):
    """
    :param token: token (string)
    :param misc: contents of the "MISC" field of the CONLLU row (string)
    :return: cleaned token (string), or None for a 'Файл' parser artifact
    """
    # UDPipe sometimes emits a spurious 'Файл' token glued to the next word;
    # drop it entirely.
    if token == 'Файл' and 'SpaceAfter=No' in misc:
        return None
    return token.strip().replace(' ', '')
def clean_lemma(lemma, pos):
    """
    Normalize a lemma produced by UDPipe.

    :param lemma: lemma (string)
    :param pos: part-of-speech tag (string)
    :return: cleaned lowercase lemma (string), or None for junk lemmas
    """
    out_lemma = lemma.strip().replace(' ', '').replace('_', '').lower()
    # Drop obvious junk: multi-value lemmas and image-file artifacts.
    if '|' in out_lemma or out_lemma.endswith(('.jpg', '.png')):
        return None
    if pos != 'PUNCT':
        # Strip at most one leading guillemet, one trailing guillemet and one
        # trailing punctuation mark left glued to the word by the tokenizer.
        # (Idiom fix: redundant ''.join() around string slices removed and
        # chained endswith calls collapsed to the tuple form — same behavior.)
        if out_lemma.startswith(('«', '»')):
            out_lemma = out_lemma[1:]
        if out_lemma.endswith(('«', '»')):
            out_lemma = out_lemma[:-1]
        if out_lemma.endswith(('!', '?', ',', '.')):
            out_lemma = out_lemma[:-1]
    return out_lemma
def list_replace(search, replacement, text):
    """Replace every character of *search* that occurs in *text* with *replacement*."""
    # Snapshot membership before any replacement, so characters introduced by
    # a replacement cannot trigger further matches.
    present = [ch for ch in search if ch in text]
    for ch in present:
        text = text.replace(ch, replacement)
    return text
def unify_sym(text): # takes a unicode string
    """Normalize typography in *text*: unify quote marks, dashes, spaces and
    dot-like symbols, transliterate a few diacritic Latin letters, then keep
    only characters from an allowed Cyrillic/Latin/punctuation/currency set."""
    # All kinds of quotation marks -> straight double quote.
    text = list_replace \
        ('\u00AB\u00BB\u2039\u203A\u201E\u201A\u201C\u201F\u2018\u201B\u201D\u2019', '\u0022', text)
    # Long dashes and overlines -> em-space-padded double hyphen.
    text = list_replace \
        ('\u2012\u2013\u2014\u2015\u203E\u0305\u00AF', '\u2003\u002D\u002D\u2003', text)
    text = list_replace('\u2010\u2011', '\u002D', text)
    # Assorted unicode spaces -> en space.
    text = list_replace \
        (
            '\u2000\u2001\u2002\u2004\u2005\u2006\u2007\u2008\u2009\u200A\u200B\u202F\u205F\u2060\u3000',
            '\u2002', text)
    text = re.sub('\u2003\u2003', '\u2003', text)
    text = re.sub('\t\t', '\t', text)
    # Bullet- and dot-like symbols -> period.
    text = list_replace \
        (
            '\u02CC\u0307\u0323\u2022\u2023\u2043\u204C\u204D\u2219\u25E6\u00B7\u00D7\u22C5\u2219\u2062',
            '.', text)
    text = list_replace('\u2217', '\u002A', text)
    text = list_replace('…', '...', text)
    text = list_replace('\u2241\u224B\u2E2F\u0483', '\u223D', text)
    # Strip diacritics from Latin letters.
    text = list_replace('\u00C4', 'A', text) # Latin
    text = list_replace('\u00E4', 'a', text)
    text = list_replace('\u00CB', 'E', text)
    text = list_replace('\u00EB', 'e', text)
    text = list_replace('\u1E26', 'H', text)
    text = list_replace('\u1E27', 'h', text)
    text = list_replace('\u00CF', 'I', text)
    text = list_replace('\u00EF', 'i', text)
    text = list_replace('\u00D6', 'O', text)
    text = list_replace('\u00F6', 'o', text)
    text = list_replace('\u00DC', 'U', text)
    text = list_replace('\u00FC', 'u', text)
    text = list_replace('\u0178', 'Y', text)
    text = list_replace('\u00FF', 'y', text)
    text = list_replace('\u00DF', 's', text)
    text = list_replace('\u1E9E', 'S', text)
    # Currency symbols are kept as-is (whitelisted below).
    currencies = list \
        (
            '\u20BD\u0024\u00A3\u20A4\u20AC\u20AA\u2133\u20BE\u00A2\u058F\u0BF9\u20BC\u20A1\u20A0\u20B4\u20A7\u20B0\u20BF\u20A3\u060B\u0E3F\u20A9\u20B4\u20B2\u0192\u20AB\u00A5\u20AD\u20A1\u20BA\u20A6\u20B1\uFDFC\u17DB\u20B9\u20A8\u20B5\u09F3\u20B8\u20AE\u0192'
        )
    alphabet = list \
        (
            '\t\n\r абвгдеёзжийклмнопрстуфхцчшщьыъэюяАБВГДЕЁЗЖИЙКЛМНОПРСТУФХЦЧШЩЬЫЪЭЮЯ,.[]{}()=+-−*&^%$#@!~;:0123456789§/\|"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ ')
    alphabet.append("'")
    allowed = set(currencies + alphabet)
    # Keep only whitelisted symbols.
    cleaned_text = [sym for sym in text if sym in allowed]
    cleaned_text = ''.join(cleaned_text)
    return cleaned_text
def process(pipeline, text='строчка текста или вербатим', keep_pos=True, keep_punct=False):
    """
    Run *text* through a UDPipe *pipeline* and return a list of tokens of the
    form 'lemma_POS' (bare lemmas when keep_pos=False).

    Consecutive proper nouns (PROPN) agreeing in case and number are merged
    into a single 'lemma::lemma_PROPN' token; digit-only NUM tokens are masked
    with 'x' characters of the same length.

    :param pipeline: ufal.udpipe Pipeline producing CONLLU output
    :param text: a single raw string (one verbatim)
    :param keep_pos: keep the '_POS' suffix on each output token
    :param keep_punct: keep punctuation tokens in the output
    :return: list of processed token strings
    """
    entities = {'PROPN'}
    named = False        # True while inside a multi-word proper-noun span
    memory = []          # lemmas of the current proper-noun span
    mem_case = None      # grammatical case of the span
    mem_number = None    # grammatical number of the span
    tagged_propn = []
    # process the text, get the result in CONLLU format:
    processed = pipeline.process(text)
    # skip service/comment lines:
    content = [l for l in processed.split('\n') if not l.startswith('#')]
    # extract lemmas, tags and morphological features from the processed text
    tagged = [w.split('\t') for w in content if w]
    for t in tagged:
        if len(t) != 10:  # malformed CONLLU row — skip
            continue
        (word_id, token, lemma, pos, xpos, feats, head, deprel, deps, misc) = t
        token = clean_token(token, misc)
        lemma = clean_lemma(lemma, pos)
        if not lemma or not token:
            continue
        if pos in entities:
            if '|' not in feats:
                tagged_propn.append('%s_%s' % (lemma, pos))
                continue
            morph = {el.split('=')[0]: el.split('=')[1] for el in feats.split('|')}
            if 'Case' not in morph or 'Number' not in morph:
                tagged_propn.append('%s_%s' % (lemma, pos))
                continue
            if not named:
                named = True
                mem_case = morph['Case']
                mem_number = morph['Number']
            if morph['Case'] == mem_case and morph['Number'] == mem_number:
                memory.append(lemma)
                # End of sentence/line flushes the accumulated span.
                if 'SpacesAfter=\\n' in misc or 'SpacesAfter=\s\\n' in misc:
                    named = False
                    past_lemma = '::'.join(memory)
                    memory = []
                    tagged_propn.append(past_lemma + '_PROPN ')
            else:
                # Case/number changed: flush the span, start fresh with this token.
                named = False
                past_lemma = '::'.join(memory)
                memory = []
                tagged_propn.append(past_lemma + '_PROPN ')
                tagged_propn.append('%s_%s' % (lemma, pos))
        else:
            if not named:
                if pos == 'NUM' and token.isdigit():  # replace numbers with 'xxxxx' of the same length
                    lemma = num_replace(token)
                tagged_propn.append('%s_%s' % (lemma, pos))
            else:
                # Non-PROPN token ends the proper-noun span: flush it first.
                named = False
                past_lemma = '::'.join(memory)
                memory = []
                tagged_propn.append(past_lemma + '_PROPN ')
                tagged_propn.append('%s_%s' % (lemma, pos))
    if not keep_punct:
        tagged_propn = [word for word in tagged_propn if word.split('_')[1] != 'PUNCT']
    if not keep_pos:
        tagged_propn = [word.split('_')[0] for word in tagged_propn]
    return tagged_propn
def tag_ud(model, text=None, stoplist=None):
    """
    Lemmatize and POS-tag each string in *text* with a UDPipe *model*,
    dropping tokens whose lemma appears in *stoplist*.

    :param model: loaded ufal.udpipe Model
    :param text: iterable of raw strings (one verbatim each)
    :param stoplist: lemmas to exclude from the output
    :return: list of token lists, one per input string
    """
    # Bug fix: the original used mutable default arguments (list literals),
    # which are shared across calls; replaced with the None-sentinel idiom.
    if text is None:
        text = ['Текст нужно передать функции в виде строки!']
    if stoplist is None:
        stoplist = [None]
    process_pipeline = Pipeline(model, 'tokenize', Pipeline.DEFAULT, Pipeline.DEFAULT, 'conllu')
    result = []
    for line in text:
        output = process(process_pipeline, text=line)
        result.append([w for w in output if w.split('_')[0] not in stoplist])
    return result
Download preprocessing model
# Fetch the UDPipe SynTagRus model used for tokenization and lemmatization.
udpipe_url = 'https://rusvectores.org/static/models/udpipe_syntagrus.model'
modelfile = wget.download(udpipe_url)
model_pp = Model.load(modelfile)
# Brand-name spellings (Cyrillic and Latin, plus truncated lemma forms) to
# strip from verbatims so clusters reflect meaning rather than brand mentions.
rus_brands = ['вискас', "пурина","гурмэ","гурме","фрискис","фрискиз","феликс","шеба","китекэт","китекет","роял","канин",
              "педигри","цезарь","екануба","план","чаппи","чапи",'whiskas','purina','gourmet','friskies','felix','sheba',
              'kitekat','royal','canin','pedigree','cesar','eukanuba','plan','chappi','шеб','гурм','пурин']
# Inflected forms of the generic word 'корм' (pet food) — carries no meaning here.
petfood = ['корм','корма','корму','кормов']
stop_list = [b.lower() for b in df.brand.unique().tolist()]+rus_brands+stopwords.words('russian')+petfood
# Smoke-test the preprocessing on three sample phrases.
text = ['Шла Саша по шоссе и сосала сушку', 'Ваша киска купила бы Вискас!', 'И когда же меня отпустит?!']
for t,m in zip(text, tag_ud(model_pp, text, stoplist=stop_list)):
    print('\nFROM: \t%s \nTO: \t%s\n'%(t,m))
corpus = tag_ud(model_pp, text=df.verbatim.tolist(), stoplist=stop_list)
# Bug fix: user-facing typo 'transofrm' -> 'transform' in the print message.
print('We transform verbatims from this:\t',df.verbatim[7],'\nInto this form:\t\t\t\t',corpus[7])
At first we'll download the pre-trained language model from web (608 mb with 788 mln. Russian words)
# Download the pre-trained RusVectores word2vec archive (~608 MB).
model_url = 'http://vectors.nlpl.eu/repository/11/182.zip'
m = wget.download(model_url)
model_file = model_url.split('/')[-1]
Then load this model for further usage
# Load the word vectors straight from the zip archive without extracting it.
with zipfile.ZipFile(model_file, 'r') as archive:
    stream = archive.open('model.bin')
    model = gensim.models.KeyedVectors.load_word2vec_format(stream, binary=True)
Language Model Demonstration:
# Demo of the loaded language model: nearest neighbours, similarity,
# odd-one-out, and word-vector arithmetic. Vocabulary keys are 'lemma_POS'.
word = 'запах_NOUN'
print('Five closest words to word "%s" by meaning'%word)
for i in model.most_similar(positive=[word], topn=5):
    print('\t',i[0])
print('Words "человек" and "кот" similarity index:',model.similarity('человек_NOUN', 'кот_NOUN'))
print('Words "собака" and "кот" similarity index:',model.similarity('собака_NOUN', 'кот_NOUN'))
print('Words "кошка" and "кот" similarity index:',model.similarity('кошка_NOUN', 'кот_NOUN'))
word_seq = 'яблоко_NOUN груша_NOUN виноград_NOUN банан_NOUN лимон_NOUN картофель_NOUN'
print('Find the extra word from sequence %s'%word_seq)
print('\t',model.doesnt_match(list('яблоко_NOUN груша_NOUN виноград_NOUN банан_NOUN лимон_NOUN картофель_NOUN'.split())))
print('Solve word equation: пицца - италия + россия = ... ',end='')
print(model.most_similar(positive=['пицца_NOUN', 'россия_NOUN'], negative=['италия_NOUN'])[0][0].split('_')[0])
print('Solve word equation: принцесса - женщина + мужчина = ... ',end='')
print(model.most_similar(positive=['принцесса_NOUN', 'мужчина_NOUN'], negative=['женщина_NOUN'])[0][0].split('_')[0])
Amazing, isn't it!?
# Show the raw embedding vector for one word (model vector dimensionality).
print('So using this model we transform each word into %i length vector, e.g. %s is transformed into:'
      %(len(model['кошка_NOUN']),'кошка_NOUN'))
print('\n',model['кошка_NOUN'])
However, some words may still be absent in model vocabulary. And if these are the only words in a sentence, it will become empty. We have to exclude such sentences from calculation.
# Drop words that are absent from the word2vec vocabulary.
corpus = [[w for w in sent if w in model] for sent in corpus]
Let's label zero-length sentences (i.e. sentences containing only stopwords or brand names)
# Mark verbatims whose whole content was removed during preprocessing as
# 'nothing'; they carry no clusterable meaning. NaN means "has meaning".
df['cluster'] = [len(x)==0 for x in corpus]
df['cluster'] = df.cluster.apply(lambda x: 'nothing' if x else np.nan)
corpus = [x for x in corpus if len(x)!=0]
# Per-brand counts of meaningful vs empty verbatims for the stacked bar chart.
df_plot = (pd.DataFrame(data={'brand':df.brand.unique()})
           .merge(df.loc[df.cluster.isna(),['brand','category']]
                  .groupby('brand', as_index=False)
                  .count()
                  .rename(columns={'category':'meaning'}),how='left')
           .merge(df.loc[df.cluster=='nothing',['brand','category']]
                  .groupby('brand', as_index=False)
                  .count()
                  .rename(columns={'category':'nothing'}),how='left')
           .fillna(0)
           .sort_values('meaning'))
meaning = go.Bar(x = df_plot['meaning'], y = df_plot['brand'], text = df_plot['meaning'],
                 orientation = 'h', textposition = 'auto', textfont = dict(size=16),
                 marker = dict(color = 'orange'), name = 'Verbatims with meaning')
nothing = go.Bar(x = df_plot['nothing'], y = df_plot['brand'], text = df_plot['nothing'],
                 orientation = 'h', textposition = 'auto', textfont = dict(size=16),
                 marker = dict(color = 'grey'), name = 'Verbatims with no meaning')
data = [meaning, nothing]
layout = go.Layout(title = 'Share of verbatims with meaning by brand',barmode = 'stack',bargap = 0.15,
                   xaxis=dict(showgrid=False, showline=False, showticklabels=False,tickfont=dict(size=38)))
pyo.iplot(go.Figure(data=data, layout=layout))
# Spot-check five random "no meaning" verbatims.
print('Examples of verbatims with no meaning:')
for v in np.random.choice(df[df.cluster=='nothing'].verbatim.values, size=5, replace=False):
    print('\t%s'%v)
Great! The majority of verbatims are available for clustering in each brand.
There are several ways to calculate vector representation for whole sentences having vector representations for the containing words: one can use vector sum or average, but this way a lot of information is being lost.
A much better approach is a weighted sum using SIF weights. Put simply, we down-weight non-unique words (those that appear in many sentences) and give more weight to unique words, as they are more defining for the exact verbatim. For more information please refer to this article.
# A SIMPLE BUT TOUGH TO BEAT BASELINE FOR SENTENCE EMBEDDINGS
# Sanjeev Arora, Yingyu Liang, Tengyu Ma
# Princeton University
# convert a list of sentence with word2vec items into a set of sentence vectors
def sentence_to_vec(sentence_list, word_weights, model, embedding_size=300, a: float = 1e-3):
    """
    Embed sentences as SIF-weighted averages of word vectors, then remove the
    projection onto the first principal component (Arora et al., 2017,
    "A Simple but Tough-to-Beat Baseline for Sentence Embeddings").

    :param sentence_list: list of token lists
    :param word_weights: dict mapping token -> corpus frequency weight
    :param model: word-vector lookup supporting model[word] (raises KeyError for OOV)
    :param embedding_size: dimensionality of the word vectors
    :param a: SIF smoothing constant
    :return: list of sentence vectors (np.ndarray of shape (embedding_size,))
    """
    sentence_set = []
    for sentence in sentence_list:
        vs = np.zeros(embedding_size)       # weighted sum of word vectors
        weights = np.zeros(embedding_size)  # accumulated SIF weights
        for word in sentence:
            # Bug fix: the bare `except:` silently swallowed every error
            # (including typos and interrupts); only missing-vocabulary
            # lookups (KeyError) should be skipped.
            try:
                a_value = a / (a + word_weights[word])  # smooth inverse frequency, SIF
                weights = np.add(weights, a_value)
                vs = np.add(vs, np.multiply(a_value, model[word]))  # vs += sif * word_vector
            except KeyError:
                continue
        # NOTE(review): a sentence with no in-vocabulary words divides 0/0 and
        # yields NaNs; callers filter empty sentences beforehand — confirm.
        vs = np.divide(vs, weights)  # weighted average
        sentence_set.append(vs)     # add to our existing re-calculated set of sentences
    # Remove the common component along the first principal axis.
    pca = PCA()
    pca.fit(np.array(sentence_set))
    u = pca.components_[0]               # the first PCA vector
    u = np.multiply(u, np.transpose(u))  # elementwise u*u, as in the original code
    # resulting sentence vectors, vs = vs - u x uT x vs
    sentence_vecs = []
    for vs in sentence_set:
        sub = np.multiply(u, vs)
        sentence_vecs.append(np.subtract(vs, sub))
    return sentence_vecs
Calculate word weights:
# Word weight = corpus frequency: occurrences divided by number of sentences.
counter_dict = dict(Counter([item for sublist in corpus for item in sublist]))
corpus_size = len(corpus)
word_weights = {k:v/corpus_size for (k,v) in counter_dict.items()}
train_data = sentence_to_vec(corpus, word_weights, model)
print('So we got %i verbatims embeded into %i-size vectors'%(np.shape(train_data)[0],np.shape(train_data)[1]))
Let's check some random verbatims for the closest ones... We'll do it with vector cosine similarity.
def cos_similarity(vec1, vec2):
    """Cosine similarity between two vectors: dot(v1, v2) / (|v1| * |v2|)."""
    return np.dot(vec1, vec2) / (np.linalg.norm(vec1) * np.linalg.norm(vec2))
def find_similar_verbatims(verbatims, verb_vectors, n_similar):
    """Pick a random verbatim and return its *n_similar* closest verbatims
    by cosine similarity of their sentence vectors."""
    idx = np.random.choice(len(verbatims))
    chosen = verbatims[idx]
    print('Finding %i similar verbatims for phraze:\n%s'%(n_similar, chosen))
    chosen_vec = verb_vectors[idx]
    similarities = [cos_similarity(chosen_vec, vec) for vec in verb_vectors]
    ranked = (pd.DataFrame({'verbatims': verbatims, 'sims': similarities})
              .sort_values('sims', ascending=False))
    # Exclude the query verbatim itself from the results.
    return ranked[ranked.verbatims != chosen]['verbatims'].head(n_similar)
# Demo: pick a random meaningful verbatim and show its 10 nearest neighbours.
verbs = df[df.cluster!='nothing'].verbatim.values
find_similar_verbatims(verbs, train_data, 10)
Looks quite good, seems like our language model managed to derive some structure from verbatim!
We'll reduce dimensions to 2 and look at vectors:
# Project the 300-d sentence vectors to 2-d with t-SNE for visual inspection.
tsne = TSNE(n_components=2, init='random', random_state=42, perplexity=50)
t0 = time()
train_tsne = tsne.fit_transform(train_data)
t1 = time()
print('Completed in %.2g sec'%(t1-t0))
tsne_data = pd.DataFrame(train_tsne, columns=['one','two'])
traces = [go.Scatter(x = tsne_data['one'],
                     y = tsne_data['two'],
                     mode='markers',
                     marker = dict(size=3,opacity=0.8))]
layout = go.Layout(title = '300d to 2d T-SNE transformed Verbatim Vectors')
fig = go.Figure(data=traces, layout=layout)
pyo.iplot(fig)
Looks pretty, doesn't it? :) The majority is wide-spread, but some zones show high density.
# Elbow method: fit KMeans for k = 1..20 and plot sqrt(inertia) per k.
max_n_clusters = 20
inert = []
for k in tqdm_notebook(range(1, max_n_clusters+1)):
    kmeans = KMeans(n_clusters=k, random_state=1).fit(train_data)
    # Fix: the original assigned `inertia = np.sqrt(...)` and never used it,
    # recomputing the same value in append(); dead variable removed.
    inert.append(np.sqrt(kmeans.inertia_))
traces = [go.Scatter(x=list(range(1,len(inert)+1)), y=inert, mode='lines+markers')]
layout = go.Layout(title='KMeans Inertia Decay', xaxis={'title':'# clusters'}, yaxis={'showticklabels':False,'showline':False})
fig = go.Figure(data=traces, layout=layout)
pyo.iplot(fig)
Using an 'Elbow-Rule' let's go with 8 clusters! Now for stability we'll re-cluster the cluster centers...
n_cluster = 8
centers = []
# Stabilize KMeans: collect cluster centers from 50 random restarts...
for i in tqdm_notebook(range(50)):
    kmeans = KMeans(n_clusters=n_cluster, random_state=i).fit(train_data)
    centers.append(kmeans.cluster_centers_)
# ...then cluster those centers themselves and predict the final labels.
final_kmeans = KMeans(n_clusters=n_cluster).fit(np.vstack(centers))
preds = final_kmeans.predict(train_data)
df.loc[df.cluster!='nothing','cluster'] = preds
df.to_csv('clustered.csv',index=False)
Look at clusters:
# Color the t-SNE scatter by predicted KMeans cluster.
tsne_data['cluster'] = preds
print(tsne_data['cluster'].value_counts())
traces = [go.Scatter(x = tsne_data[tsne_data.cluster==c]['one'],
                     y = tsne_data[tsne_data.cluster==c]['two'],
                     name='Cluster '+str(c),
                     mode='markers',
                     marker = dict(size=3,opacity=0.8))
          for c in tsne_data.cluster.unique()]
layout = go.Layout(title = '2-d T-SNE transformed Sentence Vectors with Clusters')
fig = go.Figure(data=traces, layout=layout)
pyo.iplot(fig)
tsne_data['cluster'].value_counts()
We observe that clusters are not equal: cluster 0 and 4 are quite large, while clusters 2, 3, 5 and 7 are small. It is a common result in verbatims clustering: the majority of people say similar things.
We'll use a cosine similarity to measure similarity between sencence vector representation and cluster center
# For each cluster center, compute cosine similarity of every meaningful
# verbatim to that center (temporary 'Def verbatims N' columns).
clust = 0
for kcenter in final_kmeans.cluster_centers_:
    col_name = 'Def verbatims ' + str(clust)
    df[col_name] = 0
    df.loc[df.cluster!='nothing',col_name] = [cos_similarity(kcenter, v) for v in train_data]
    clust+=1
# Keep only the similarity to the verbatim's own cluster center.
df['cluster_defining'] = df.apply(lambda x: x['Def verbatims 0'] if x['cluster']==0 else
                                  (x['Def verbatims 1'] if x['cluster']==1 else
                                   (x['Def verbatims 2'] if x['cluster']==2 else
                                    (x['Def verbatims 3'] if x['cluster']==3 else
                                     (x['Def verbatims 4'] if x['cluster']==4 else
                                      (x['Def verbatims 5'] if x['cluster']==5 else
                                       (x['Def verbatims 6'] if x['cluster']==6 else
                                        (x['Def verbatims 7'] if x['cluster']==7 else 0))))))),axis=1)
# Drop the temporary per-cluster columns.
df = df.drop([col for col in df.columns if col.startswith('Def')], axis=1)
Finally we get the following data frame, where cluster_defining column is opposite to distance to the cluster center.
df.to_csv('clustered_1.csv',index=False)
# Print the 7 most center-similar verbatims per cluster.
for clust in sorted(df[df.cluster!='nothing']['cluster'].unique()):
    print('Cluster %i size: %i'%(int(clust),len(df[df.cluster==clust])))
    print('Most defining verbatims:')
    temp = df.loc[df.cluster==clust,].sort_values('cluster_defining', ascending=False).verbatim.head(7)
    for i in temp:
        print('\t',i)
    print('\n')
So we get following results:
It doesn't look like ideal clustering yet, mostly because we have 2 large clusters that can't be defined precisely.
Let's try a more complicated clustering approach...
On the T-SNE plot we see that the data tends to form ellipsoids rather than compact blobs. Therefore a mixture of Gaussians may provide a better cluster representation. At least it is worth trying :)
# Model selection for GMM: fit 1..20 components and compare BIC/AIC.
bic = []
aic = []
gmm_train = np.array(train_data)
for n in tqdm_notebook(np.arange(1, 21)):
    mod = GMM(n, covariance_type='full', random_state=0).fit(gmm_train)
    bic.append(mod.bic(gmm_train))
    aic.append(mod.aic(gmm_train))
# Use the average of AIC and BIC as the selection criterion.
aic_bic_avg = np.array(pd.Series(aic) + pd.Series(bic))/2
n_clusters = np.arange(1, 21)
traces = [go.Scatter(x=n_clusters, y=bic, mode='lines+markers', name='BIC'),
          go.Scatter(x=n_clusters, y=aic, mode='lines+markers', name='AIC'),
          go.Scatter(x=n_clusters, y=aic_bic_avg, mode='lines+markers', name='average'),
          go.Scatter(x=n_clusters, y=np.ones(20)*np.min(aic_bic_avg), mode='lines', name='minimum of average',
                     line={'dash':'dash','color':'grey'})]
layout = go.Layout(title='Choosing GMM number of clusters',
                   yaxis={'showticklabels':False,'showline':False},
                   xaxis={'title':'# clusters'})
fig = go.Figure(data=traces, layout=layout)
pyo.iplot(fig)
print('Best choice for number of clusters is %i'%(np.argmin(aic_bic_avg)+1))
10 clusters seem to be the best choice
# Fit the chosen 10-component GMM and attach hard labels + max posterior.
gmm_model = GMM(10, covariance_type='full', random_state=0).fit(train_data)
gmm_preds = gmm_model.predict(train_data)
gmm_preds_proba = gmm_model.predict_proba(train_data)
tsne_data['gmm_cluster'] = gmm_preds
df['gmm_cluster'] = df['cluster']
df['gmm_proba'] = 0
df.loc[df.gmm_cluster!='nothing','gmm_cluster'] = gmm_preds
df.loc[df.gmm_cluster!='nothing','gmm_proba'] = gmm_preds_proba.max(axis=1)
df.to_csv('clustered_gmm.csv',index=False)
sns.set(rc={'figure.figsize':(15,10)})
sns.scatterplot(x=tsne_data['one'], y=tsne_data['two'], hue=tsne_data['gmm_cluster'],
                legend='full', palette='Set1')
# Print the 15 highest-posterior verbatims per GMM cluster.
for clust in sorted(df[df.gmm_cluster!='nothing']['gmm_cluster'].unique()):
    print('Cluster %i size: %i'%(int(clust),len(df[df.gmm_cluster==clust])))
    print('Most defining verbatims:')
    temp = df.loc[df.gmm_cluster==clust,].sort_values('gmm_proba', ascending=False).verbatim.head(15)
    probs = df.loc[df.gmm_cluster==clust,].sort_values('gmm_proba', ascending=False).gmm_proba.head(15)
    for i,j in zip(probs,temp):
        print('\t[%.1f]'%i,j)
    print('\n')
There is a structure, but the results don't seem more meaningful than in KMeans clustering. So we'll stick to our initial approach.
So we decided to use the 8-cluster KMeans model. But 2 clusters (2 and 6) are still quite ambiguous. Let's take them out and cluster again! :)
At first we'll filter train_data to keep only vectors of clusters 2 and 6
# GMM results rejected — drop the GMM columns and keep the KMeans labels.
df = df.drop(['gmm_cluster','gmm_proba'], axis=1)
Make a new sentence data corpus
# Re-lemmatize only the verbatims from the two ambiguous clusters (2 and 6).
verbs_sec = df[df.cluster.isin([2,6])].verbatim.tolist()
corpus_second = tag_ud(model_pp, text=verbs_sec, stoplist=stop_list)
And create new vectors
# Recompute SIF word weights and sentence vectors on the sub-corpus only.
counter_dict = dict(Counter([item for sublist in corpus_second for item in sublist]))
corpus_size = len(corpus_second)
word_weights = {k:v/corpus_size for (k,v) in counter_dict.items()}
train_second = sentence_to_vec(corpus_second, word_weights, model)
# Visualize the sub-corpus with t-SNE.
tsne = TSNE(n_components=2, init='random', random_state=42, perplexity=50)
t0 = time()
train_tsne = tsne.fit_transform(train_second)
t1 = time()
print('Completed in %.2g sec'%(t1-t0))
tsne_second = pd.DataFrame(train_tsne, columns=['one','two'])
traces = [go.Scatter(x = tsne_second['one'],
                     y = tsne_second['two'],
                     mode='markers',
                     marker = dict(size=3,opacity=0.8))]
layout = go.Layout(title = '300d to 2d T-SNE transformed Verbatim Vectors')
fig = go.Figure(data=traces, layout=layout)
pyo.iplot(fig)
And now to clustering. We'll stick only to K-Means this time.
# Elbow method on the second-level sub-corpus: k = 1..20.
max_n_clusters = 20
inert = []
for k in tqdm_notebook(range(1, max_n_clusters+1)):
    kmeans = KMeans(n_clusters=k, random_state=1).fit(train_second)
    # Fix: dead intermediate `inertia` variable removed (value was recomputed
    # in the append call anyway).
    inert.append(np.sqrt(kmeans.inertia_))
traces = [go.Scatter(x=list(range(1,len(inert)+1)), y=inert, mode='lines+markers')]
layout = go.Layout(title='KMeans Inertia Decay', xaxis={'title':'# clusters'}, yaxis={'showticklabels':False,'showline':False})
fig = go.Figure(data=traces, layout=layout)
pyo.iplot(fig)
11 clusters is our choice!
n_cluster = 11
centers = []
# Same center-stabilization trick as for the first-level clustering.
for i in tqdm_notebook(range(50)):
    kmeans = KMeans(n_clusters=n_cluster, random_state=i).fit(train_second)
    centers.append(kmeans.cluster_centers_)
final_kmeans = KMeans(n_clusters=n_cluster).fit(np.vstack(centers))
preds = final_kmeans.predict(train_second)+100 #add 100 to identify 2nd level clusters from 1st level ones
Look at clusters:
# Plot second-level clusters (labels 100..110) and write them back into df.
tsne_second['cluster'] = preds
print(tsne_second['cluster'].value_counts())
traces = [go.Scatter(x = tsne_second[tsne_second.cluster==c]['one'],
                     y = tsne_second[tsne_second.cluster==c]['two'],
                     name='Cluster '+str(c),
                     mode='markers',
                     marker = dict(size=3,opacity=0.8))
          for c in tsne_second.cluster.unique()]
layout = go.Layout(title = '2-d T-SNE transformed Sentence Vectors with Clusters')
fig = go.Figure(data=traces, layout=layout)
pyo.iplot(fig)
df.loc[df.cluster.isin([2,6]),'cluster'] = preds
df.to_csv('clustered_2.csv',index=False)
Show 10 most defining verbatims for clusters.
# Cosine similarity of each 2nd-level verbatim to its cluster center
# (temporary 'Def verbatims N' columns, N = 0..10).
clust = 0
for kcenter in final_kmeans.cluster_centers_:
    col_name = 'Def verbatims ' + str(clust)
    df[col_name] = 0
    df.loc[df.cluster.isin(list(range(100,111))),col_name] = [cos_similarity(kcenter, v) for v in train_second]
    clust+=1
# NOTE(review): comparisons use 0..10 while 2nd-level labels are 100..110 —
# this mirrors the first-level code pattern; verify intended label offset.
df['cluster_defining_second'] = df.apply(lambda x: x['Def verbatims 0'] if x['cluster']==0 else
                                         (x['Def verbatims 1'] if x['cluster']==1 else
                                          (x['Def verbatims 2'] if x['cluster']==2 else
                                           (x['Def verbatims 3'] if x['cluster']==3 else
                                            (x['Def verbatims 4'] if x['cluster']==4 else
                                             (x['Def verbatims 5'] if x['cluster']==5 else
                                              (x['Def verbatims 6'] if x['cluster']==6 else
                                               (x['Def verbatims 7'] if x['cluster']==7 else
                                                (x['Def verbatims 8'] if x['cluster']==8 else
                                                 (x['Def verbatims 9'] if x['cluster']==9 else
                                                  (x['Def verbatims 10'] if x['cluster']==10 else 0)))))))))),axis=1)
df = df.drop([col for col in df.columns if col.startswith('Def')], axis=1)
# Print the 10 most defining verbatims per 2nd-level cluster.
for clust in sorted(df[df.cluster.isin(list(range(100,111)))]['cluster'].unique()):
    print('Cluster %i size: %i'%(int(clust),len(df[df.cluster==clust])))
    print('Most defining verbatims:')
    temp = df.loc[df.cluster==clust,].sort_values('cluster_defining_second', ascending=False).verbatim.head(10)
    for i in temp:
        print('\t',i)
    print('\n')
So we get following results:
Seems like we will be merging some clusters later.
Now we can see that we still have one cluster that is very diverse and not homogeneous. We will make the last, 3rd attempt to group those verbatims somehow.
So we added another 9 clusters to previous 6, but clusters 100 and 102 are still very diverse. Let's try clustering them again!
# Third-level pass: re-vectorize the still-diverse clusters 100 and 102.
verbs_3 = df[df.cluster.isin([100,102])].verbatim.tolist()
corpus_3 = tag_ud(model_pp, text=verbs_3, stoplist=stop_list)
counter_dict = dict(Counter([item for sublist in corpus_3 for item in sublist]))
corpus_size = len(corpus_3)
word_weights = {k:v/corpus_size for (k,v) in counter_dict.items()}
train_3 = sentence_to_vec(corpus_3, word_weights, model)
# Visualize the third-level sub-corpus with t-SNE.
tsne = TSNE(n_components=2, init='random', random_state=42, perplexity=50)
t0 = time()
train_tsne = tsne.fit_transform(train_3)
t1 = time()
print('Completed in %.2g sec'%(t1-t0))
tsne_3 = pd.DataFrame(train_tsne, columns=['one','two'])
traces = [go.Scatter(x = tsne_3['one'],
                     y = tsne_3['two'],
                     mode='markers',
                     marker = dict(size=3,opacity=0.8))]
layout = go.Layout(title = '300d to 2d T-SNE transformed Verbatim Vectors')
fig = go.Figure(data=traces, layout=layout)
pyo.iplot(fig)
It is hard to find clusters here. Only few zones with something that may form clusters. Lets check it.
# Elbow method on the third-level sub-corpus: k = 1..20.
max_n_clusters = 20
inert = []
for k in tqdm_notebook(range(1, max_n_clusters+1)):
    kmeans = KMeans(n_clusters=k, random_state=1).fit(train_3)
    # Fix: dead intermediate `inertia` variable removed (value was recomputed
    # in the append call anyway).
    inert.append(np.sqrt(kmeans.inertia_))
traces = [go.Scatter(x=list(range(1,len(inert)+1)), y=inert, mode='lines+markers')]
layout = go.Layout(title='KMeans Inertia Decay', xaxis={'title':'# clusters'}, yaxis={'showticklabels':False,'showline':False})
fig = go.Figure(data=traces, layout=layout)
pyo.iplot(fig)
n_cluster = 5
centers = []
# Same center-stabilization trick as on the previous levels.
for i in tqdm_notebook(range(50)):
    kmeans = KMeans(n_clusters=n_cluster, random_state=i).fit(train_3)
    centers.append(kmeans.cluster_centers_)
final_kmeans = KMeans(n_clusters=n_cluster).fit(np.vstack(centers))
preds = final_kmeans.predict(train_3)+200 #add 200 to identify 3rd level clusters from 1st and 2nd ones
tsne_3['cluster'] = preds
print(tsne_3['cluster'].value_counts())
traces = [go.Scatter(x = tsne_3[tsne_3.cluster==c]['one'],
                     y = tsne_3[tsne_3.cluster==c]['two'],
                     name='Cluster '+str(c),
                     mode='markers',
                     marker = dict(size=3,opacity=0.8))
          for c in tsne_3.cluster.unique()]
layout = go.Layout(title = '2-d T-SNE transformed Sentence Vectors with Clusters')
fig = go.Figure(data=traces, layout=layout)
pyo.iplot(fig)
df.loc[df.cluster.isin([100,102]),'cluster'] = preds
df.to_csv('clustered.csv',index=False)
# Cosine similarity to 3rd-level centers (labels 200..204).
clust = 0
for kcenter in final_kmeans.cluster_centers_:
    col_name = 'Def verbatims ' + str(clust)
    df[col_name] = 0
    df.loc[df.cluster.isin(list(range(200,205))),col_name] = [cos_similarity(kcenter, v) for v in train_3]
    clust+=1
# NOTE(review): comparisons use 0..4 while 3rd-level labels are 200..204 —
# mirrors the earlier pattern; verify intended label offset.
df['cluster_defining_third'] = df.apply(lambda x: x['Def verbatims 0'] if x['cluster']==0 else
                                        (x['Def verbatims 1'] if x['cluster']==1 else
                                         (x['Def verbatims 2'] if x['cluster']==2 else
                                          (x['Def verbatims 3'] if x['cluster']==3 else
                                           (x['Def verbatims 4'] if x['cluster']==4 else 0)))),axis=1)
df = df.drop([col for col in df.columns if col.startswith('Def')], axis=1)
# Print the 10 most defining verbatims per 3rd-level cluster.
for clust in sorted(df[df.cluster.isin(list(range(200,205)))]['cluster'].unique()):
    print('Cluster %i size: %i'%(int(clust),len(df[df.cluster==clust])))
    print('Most defining verbatims:')
    temp = df.loc[df.cluster==clust,].sort_values('cluster_defining_second', ascending=False).verbatim.head(10)
    for i in temp:
        print('\t',i)
    print('\n')
Yay! It worked! We still have mixed type clusters, but we got some new stuff!
So we get following results:
It seems like we managed to derive all the structure from the verbatims that we could by mathematical means. Let's analyze what we got here...
Lets combine them a little bit:
# Map each numeric cluster id (sorted as strings) to a human-readable name.
df['cluster'] = df['cluster'].astype(str)
sorted(df['cluster'].unique())
# Order of names matches sorted(df['cluster'].unique()) above.
cluster_names = ['tasty','pricing concerns','pricing concerns','good quality','good quality','dont know','good quality',
                 'useful, healthy','pricing concerns','dont know','premium, elite','pricing concerns','unclustered',
                 'unclustered','unclustered','good quality','pure facts','saw advertizing','pure facts','pure facts',
                 'nothing']
df = df.merge(pd.DataFrame({'cluster':sorted(df['cluster'].unique()),'cluster_names':cluster_names}), how='left')
df = df.drop(['cluster_defining','cluster_defining_second','cluster_defining_third','cluster'], axis=1)
# Prepare per-category counts for the pie charts, excluding non-informative clusters.
clust_to_remove = ['unclustered','nothing','dont know','pure facts']
df_cat = (df.loc[df['cluster_names'].isin(clust_to_remove)==False,['category','cluster_names','brand']]
          .groupby(['category','cluster_names'], as_index=False).count().sort_values('cluster_names'))
# Two donut charts (CAT vs DOG) of meaningful cluster shares.
fig = {
    'data':[
        {
            'values':df_cat.loc[df_cat.category=='CAT','brand'],
            'labels':df_cat.loc[df_cat.category=='CAT','cluster_names'],
            'domain':{'column':0},
            'name':'CAT',
            'hole':.4,
            'type':'pie'
        },
        {
            'values':df_cat.loc[df_cat.category=='DOG','brand'],
            'labels':df_cat.loc[df_cat.category=='DOG','cluster_names'],
            'domain':{'column':1},
            'name':'DOG',
            'hole':.4,
            'type':'pie'
        }
    ],
    'layout':{
        'title':'Verbatims Clusters by Category (only meaningful clusters)',
        'grid':{'rows':1,'columns':2},
        'annotations':[
            {
                'font':{'size':24},
                'showarrow':False,
                'text':'CAT',
                'x':.2,
                'y':.5
            },
            {
                'font':{'size':24},
                'showarrow':False,
                'text':'DOG',
                'x':.8,
                'y':.5
            }
        ],
        'showlegend':True
    }
}
pyo.iplot(fig)
Let's look at brand's profiles. We have these brands:
df[['category','brand']].drop_duplicates()
# Inverse-frequency weight per cluster, so over-represented clusters do not
# dominate the per-brand profiles.
df_weights = pd.DataFrame(df.cluster_names.value_counts()).reset_index().rename(columns={'index':'cluster_names',
                                                                                         'cluster_names':'counts'})
df_weights['counts'] = 1/df_weights['counts']
df = df.merge(df_weights, how='left')
Define a function to easily plot brand cluster profile.
def plot_brand_clusters(df, category, brand, color):
    """Show a horizontal bar chart of the weighted cluster profile for one brand.

    Parameters
    ----------
    df : DataFrame with 'category', 'brand', 'cluster_names', 'counts' columns.
    category : 'CAT' or 'DOG'.
    brand : brand name as it appears in df.
    color : plotly bar color (name or 'rgb(...)' string).
    """
    skip = ['unclustered', 'nothing', 'dont know', 'pure facts']
    row_mask = ((df.category == category) &
                (df.brand == brand) &
                (~df['cluster_names'].isin(skip)))
    profile = (df.loc[row_mask, ['cluster_names', 'counts']]
                 .groupby('cluster_names', as_index=False).sum()
                 .sort_values('counts')
                 .rename(columns={'cluster_names': 'cluster'}))
    # Normalize weights to percentage strings used both as bar length and label.
    total = sum(profile['counts'])
    profile['counts'] = profile['counts'].apply(lambda w: str(round(w / total * 100, 1)) + "%")
    bars = [
        go.Bar(x=profile['counts'], y=profile['cluster'],
               text=profile['counts'],
               orientation='h',
               textposition='auto',
               textfont=dict(size=18),
               marker=dict(color=color),
               name=category + " " + brand)
    ]
    chart_layout = go.Layout(title='{} {} verbatims profile'.format(category, brand),
                             bargap=0.15, yaxis=dict(automargin=True),
                             xaxis=dict(showgrid=False, showline=False, showticklabels=False, tickfont=dict(size=38)))
    pyo.iplot(go.Figure(data=bars, layout=chart_layout))
I am not going to explore every brand, just show some of them
# A few representative brands per category, each with its own bar color.
for cat, brand, bar_color in [('CAT', 'Perfect Fit', 'grey'),
                              ('CAT', 'Sheba', 'rgb(204,204,0)'),
                              ('CAT', 'Felix', 'rgb(47,85,151)'),
                              ('DOG', 'Royal Canin', 'red'),
                              ('DOG', 'Pro Plan', 'black')]:
    plot_brand_clusters(df, cat, brand, bar_color)
Finally, let's apply some aesthetics and make shiny word clouds!
Our text preparation model helps easily complete this task.
# Snapshot of the fully clustered frame before word-cloud preparation.
df.to_csv('clustered_final.csv', index=False)
# Re-lemmatize every verbatim with the UDPipe pipeline.
corpus_full = tag_ud(model_pp, text=df.verbatim.tolist(), stoplist=stop_list)
# Flag rows whose processed verbatim came back empty, then drop them while
# keeping category/brand aligned with the filtered corpus.
empty_flags = [len(tokens) == 0 for tokens in corpus_full]
df_cloud = pd.DataFrame(data={'category': df['category'],
                              'brand': df['brand'],
                              'cluster': empty_flags})
df_cloud['cluster'] = df_cloud.cluster.apply(lambda is_empty: np.nan if is_empty else 'keep')
# NOTE(review): dropna() also removes rows with NaN in 'brand'/'category',
# which would misalign df_cloud with corpus_full — the length check a few
# cells below guards this; verify it holds on new data.
df_cloud = df_cloud.dropna()
corpus_full = [tokens for tokens in corpus_full if len(tokens) != 0]
df_cloud = df_cloud.reset_index(drop=True).drop('cluster', axis=1)
In corpus_full each word carries a part-of-speech tag, such as кошка_NOUN.
For the word clouds we can strip this tag and join the word lists back into single sentences.
# Strip the _POS suffix from every token and rebuild plain sentences.
corpus_clean = []
for sent in corpus_full:
    corpus_clean.append(' '.join(token.split('_')[0] for token in sent))
corpus_clean[:10]
df_cloud.head(3)
# Sanity check: corpus and data frame must stay the same length (row-aligned).
len(df_cloud) == len(corpus_clean)
# Snapshot both artifacts so this point can be restored without re-tagging.
df_cloud.to_csv('wordcloud_dataframe.csv', index=False)
with open('corpus_clean.pkl', 'wb') as f:
    pickle.dump(corpus_clean, f)
Now that we have a data frame of categories and brands, plus a prepared verbatim corpus of the same length, we can combine one long text for each brand-category pair.
# One row per unique (category, brand) pair.
cat_brands = df_cloud.drop_duplicates().reset_index(drop=True)
# Concatenate every cleaned verbatim belonging to each pair into one text.
# (Single column assignment instead of per-row .loc writes into a NaN column.)
pair_texts = []
for c, b in zip(cat_brands['category'], cat_brands['brand']):
    member_idx = df_cloud.index[(df_cloud.category == c) & (df_cloud.brand == b)]
    pair_texts.append(" ".join(corpus_clean[i] for i in member_idx))
cat_brands['text'] = pair_texts
cat_brands['text_length'] = cat_brands.text.apply(len)
cat_brands.head(5)
Let's take a look at word frequencies and drop some too frequent but meaningless words.
# Word frequencies over all brand texts combined.
# (Counter is already a dict, and DataFrame can consume its items directly —
# no need to wrap in dict() or iterate .items() twice.)
w_counts = Counter(" ".join(cat_brands.text).split())
w_freq = (pd.DataFrame(list(w_counts.items()), columns=['words', 'counts'])
            .sort_values('counts', ascending=False))
print('Total %i words in vocabulary left' % len(w_freq))
w_freq.head(30)
We can see some meaningless words in the top of counts. Let's get rid of them!
# Frequent but uninformative words (cat, dog, brand, price, ...) to drop
# from every brand text before drawing word clouds.
meaningless_words = ['кошка','собака','марка','ничто','кот','знать','очень', 'еда','нравиться',
                     'животное','хороший','упаковка','покупать','самый','который','цена']
stop_set = frozenset(meaningless_words)
cat_brands['text_cleaned'] = [
    " ".join(w for w in t.split() if w not in stop_set)
    for t in cat_brands.text
]
# NOTE(review): column name has a double underscore — kept as-is since the
# frame is persisted to CSV and may be read downstream by this exact name.
cat_brands['text__cleaned_length'] = cat_brands.text_cleaned.apply(len)
cat_brands
def _show_test_cloud(row, intro):
    """Print a caption and render a default-styled word cloud for one row of
    cat_brands — a quick sanity check before the styled silhouette clouds."""
    print('%s test wordcloud for %s %s' % (intro,
                                           cat_brands.iloc[row].category,
                                           cat_brands.iloc[row].brand))
    # Create and generate a word cloud image:
    cloud = WordCloud().generate(cat_brands.text_cleaned[row])
    # Display the generated image:
    plt.figure(figsize=[20, 10])
    plt.imshow(cloud, interpolation='bilinear')
    plt.axis("off")
    plt.show()

# Same three sample rows as before; the captions read identically.
for intro, row in [('Just a', 3), ('And another', 7), ('And another', 11)]:
    _show_test_cloud(row, intro)
# Snapshot: persist the per-brand texts, then immediately reload so the
# notebook can be resumed from this point without recomputing the corpus.
cat_brands.to_csv('cat_brands.csv', index=False)
cat_brands = pd.read_csv('cat_brands.csv')
For the wordcloud library we need to convert the silhouette image into a numeric mask with values in the 0-255 range.
The second function builds and prints the word cloud, and also saves it into a subfolder.
def transform_image_mask(img_name = 'cat_sit'):
mask = np.array(Image.open('./silouettes/'+img_name+'.png'))
initial_font_number = mask[0][0][0]
transformed = np.ndarray((mask.shape[0],mask.shape[1]), np.int32)
for x in range(len(mask)):
transformed[x] = [255-i+initial_font_number for i in mask[x].T[1]]
return transformed
def print_word_cloud(dataframe, img_name = 'cat_sit', category='CAT', brand='Whiskas',
                     background_color="white", contour_color='purple', contour_width=2, max_words=1000):
    """Render, display and save a silhouette-shaped word cloud for one brand.

    Parameters
    ----------
    dataframe : frame with 'category', 'brand', 'text_cleaned' columns.
    img_name : silhouette PNG name (without extension) under ./silouettes/.
    category, brand : row selector; exactly one matching row is expected.
    background_color, contour_color, contour_width, max_words : WordCloud styling.

    Side effects: saves <category>_<brand>_wordcloud.png under word_clouds/
    and shows the figure with matplotlib.
    """
    mask = transform_image_mask(img_name)
    wc = WordCloud(background_color=background_color, max_words=max_words, mask=mask,
                   contour_width=contour_width, contour_color=contour_color)
    # .values[0] raises IndexError if the (category, brand) pair is absent.
    text = dataframe.loc[(dataframe.category==category)&(dataframe.brand==brand),'text_cleaned'].values[0]
    wc.generate(text)
    # Fix: ensure the output folder exists — the original raised
    # FileNotFoundError when 'word_clouds/' was missing.
    os.makedirs('word_clouds', exist_ok=True)
    file_name = category+'_'+brand+'_wordcloud'+'.png'
    wc.to_file('word_clouds/'+file_name)
    plt.figure(figsize=[20,10])
    plt.imshow(wc, interpolation='bilinear')
    plt.axis("off")
    plt.title(' '.join([category, brand,'wordcloud']))
    plt.show()
So we're geared up and ready to draw our own unique word clouds!
# One styled silhouette cloud per brand; only the image, brand and contour
# color vary between calls. ('cat_stratch' matches the actual file name.)
cloud_specs = [
    ('cat_sit', 'CAT', 'Whiskas', 'purple'),
    ('cat_walk', 'CAT', 'Kitekat', 'green'),
    ('cat_stratch', 'CAT', 'Felix', 'blue'),
    ('dog_sit', 'DOG', 'Pedigree', 'orange'),
    ('dog_stand', 'DOG', 'Cesar', 'black'),
]
for img, cat, brand, contour in cloud_specs:
    print_word_cloud(cat_brands, img_name=img, category=cat, brand=brand,
                     background_color="white", contour_color=contour,
                     contour_width=4, max_words=1000)